numpymatplotlib/seabornplotlyimport these into our programimport matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
sns.set_context('poster')
import pandas as pd
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
DataFrame, which is a bit like an Excel worksheet - it uses columns and row to store data in a tabular formatsitename is not the same as SiteName)original_data = pd.read_csv('VBV_Data\VBV_20170131_JTC00567_shay_2940.csv', low_memory=False)
original_data.head(15)
| RecordedDataTime | SiteName | SpeedMPH | Class | Length(Metres) | Chasis Ht | Info | Gap | Lane | Direction | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-06-01 00:01:04 | JTC00567 | 68 | 2 | 4 | NaN | Car / Small Van <5.2 metres | NaN | 1 | S |
| 1 | 2016-06-01 00:01:09 | JTC00567 | 64 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 49.0 | 1 | S |
| 2 | 2016-06-01 00:01:12 | JTC00567 | 57 | 5 | 18 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | NaN | 1 | N |
| 3 | 2016-06-01 00:01:25 | JTC00567 | 76 | 4 | 6 | NaN | Rigid HGV / Large Van | 158.0 | 1 | S |
| 4 | 2016-06-01 00:02:30 | JTC00567 | 58 | 4 | 8 | NaN | Rigid HGV / Large Van | NaN | 2 | N |
| 5 | 2016-06-01 00:02:30 | JTC00567 | 57 | 5 | 16 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 774.0 | 1 | N |
| 6 | 2016-06-01 00:03:05 | JTC00567 | 53 | 5 | 15 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 1002.0 | 1 | S |
| 7 | 2016-06-01 00:04:41 | JTC00567 | 58 | 5 | 18 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 950.0 | 1 | S |
| 8 | 2016-06-01 00:04:44 | JTC00567 | 77 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 1335.0 | 1 | N |
| 9 | 2016-06-01 00:04:53 | JTC00567 | 66 | 2 | 3 | NaN | Car / Small Van <5.2 metres | 110.0 | 1 | S |
| 10 | 2016-06-01 00:04:58 | JTC00567 | 57 | 5 | 16 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 138.0 | 1 | N |
| 11 | 2016-06-01 00:05:08 | JTC00567 | 51 | 5 | 16 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 146.0 | 1 | S |
| 12 | 2016-06-01 00:05:10 | JTC00567 | 52 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 21.0 | 1 | S |
| 13 | 2016-06-01 00:05:50 | JTC00567 | 61 | 2 | 4 | NaN | Car / Small Van <5.2 metres | 507.0 | 1 | N |
| 14 | 2016-06-01 00:06:04 | JTC00567 | 48 | 5 | 17 | NaN | Rigid HGV + Trailer / Standard Articulated (Ca... | 141.0 | 1 | N |
pandas, we can start to manipulate it.pandas) can manipulate dates and times, if they're the correct kind of object.# Create a "Count" column, which will be 1 for each record to begin wth
original_data['Count'] = 1
# Make sure dates are used as dates, and set them as the index so we can summarise
original_data['DateTime'] = pd.to_datetime(original_data['RecordedDataTime'])
original_data.set_index('DateTime', inplace=True)
# Work out the vehicle count per class and direction for each hour
hourly_counts = original_data.groupby([pd.TimeGrouper('1H'), 'Info'])['Count'].count().reset_index()
hourly_counts.head(15)
| DateTime | Info | Count | |
|---|---|---|---|
| 0 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | 48 |
| 1 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres with Trailer | 2 |
| 2 | 2016-06-01 00:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 27 |
| 3 | 2016-06-01 00:00:00 | Rigid HGV / Large Van | 11 |
| 4 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | 35 |
| 5 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres with Trailer | 3 |
| 6 | 2016-06-01 01:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 31 |
| 7 | 2016-06-01 01:00:00 | Rigid HGV / Large Van | 9 |
| 8 | 2016-06-01 02:00:00 | Car / Small Van <5.2 metres | 30 |
| 9 | 2016-06-01 02:00:00 | Car / Small Van <5.2 metres with Trailer | 2 |
| 10 | 2016-06-01 02:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 39 |
| 11 | 2016-06-01 02:00:00 | Rigid HGV / Large Van | 17 |
| 12 | 2016-06-01 03:00:00 | Car / Small Van <5.2 metres | 61 |
| 13 | 2016-06-01 03:00:00 | Car / Small Van <5.2 metres with Trailer | 1 |
| 14 | 2016-06-01 03:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | 36 |
# Work out the vehicle count per class and direction for each hour
hourly_counts_d = original_data.groupby([pd.TimeGrouper('1H'), 'Info', 'Direction'])['Count'].count().reset_index()
hourly_counts_d.head(15)
| DateTime | Info | Direction | Count | |
|---|---|---|---|---|
| 0 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | N | 26 |
| 1 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | S | 22 |
| 2 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres with Trailer | N | 2 |
| 3 | 2016-06-01 00:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | N | 15 |
| 4 | 2016-06-01 00:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | S | 12 |
| 5 | 2016-06-01 00:00:00 | Rigid HGV / Large Van | N | 8 |
| 6 | 2016-06-01 00:00:00 | Rigid HGV / Large Van | S | 3 |
| 7 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | N | 19 |
| 8 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | S | 16 |
| 9 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres with Trailer | N | 1 |
| 10 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres with Trailer | S | 2 |
| 11 | 2016-06-01 01:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | N | 14 |
| 12 | 2016-06-01 01:00:00 | Rigid HGV + Trailer / Standard Articulated (Ca... | S | 17 |
| 13 | 2016-06-01 01:00:00 | Rigid HGV / Large Van | N | 7 |
| 14 | 2016-06-01 01:00:00 | Rigid HGV / Large Van | S | 2 |
hourly_counts.to_csv('JTC00567 Hourly Summaries.csv', index=False)
hourly_counts_d.to_csv('JTC00567 Directional Hourly Summaries.csv', index=False)
DataFrames to Excel# Create a "writer"
excel_writer = pd.ExcelWriter('JTC00567.xlsx')
# Write each DataFrame to the writer separately, specifying the name we want to use for the worksheet
original_data.to_excel(excel_writer, 'Raw Counts', index=False)
hourly_counts.to_excel(excel_writer, 'Hourly Counts', index=False)
hourly_counts_d.to_excel(excel_writer, 'Hourly Counts_Directional', index=False)
# Save the "writer"
excel_writer.save()
matplotlib is a plotting library for Python, allowing for the production of a wide range of graphsseaborn extends matplotlib by adding support for further types of graphs, improved interaction with pandas dataframes and generally by making matplotlib graphs prettierseaborn into our project, we use its colour schemes even when we plot graphs with matplotlib (which in some cases is easier)matplotlib can plot line charts from our data, and has a special interface for dealing with datesgroupby# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
# Get the data for each class in the DataFrame
for vehicle_class, data in hourly_counts.groupby('Info'):
# Plot the data for that class
ax.plot_date(data['DateTime'], data['Count'], label=vehicle_class, linestyle='-', marker='')
# Set up legend
plt.legend(frameon=True, facecolor='white',
loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.6))
# Set up axis labels
plt.xlabel('Date')
plt.ylabel('Vehicles')
# Fix margins properly
plt.ylim(bottom=0)
ax.margins(x=0)
plt.title('Total Flow - JTC00567')
<matplotlib.text.Text at 0x2384b5e8710>
subplots# Set up the subplots
fig, all_axes = plt.subplots(2, figsize=(15, 10), sharex=True, sharey=True)
# Get the data for each class in the DataFrame
for i, (direction, directional_data) in enumerate(hourly_counts_d.groupby('Direction')):
for vehicle_class, data in directional_data.groupby('Info'):
# Plot the data for that class
all_axes[i].plot_date(data['DateTime'], data['Count'], label=vehicle_class, linestyle='-', marker='')
all_axes[i].margins(x=0)
all_axes[i].set_title(direction)
all_axes[i].set_ylabel('Vehicles')
# Set up legend
plt.legend(frameon=True, facecolor='white',
loc='lower center', ncol=2, bbox_to_anchor=(0.5, -0.7))
# Set up labels
plt.xlabel('Date')
fig.suptitle('Flow by Direction - JTC00567')
# Fix margins properly
plt.ylim(bottom=0)
(0, 1582.3)
pandas includes a method that lets a DataFrame be exported as a histogram# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
# Filter our original DataFrame to only contain cars (class 2)
cars = original_data[original_data['Class'] == '2']
# Set up the bins we want to use in our histogram
bin_size = 10
bins = range(0,max(cars['SpeedMPH']) + bin_size, bin_size)
# Plot the histogram
cars.hist(column='SpeedMPH', bins=bins, ax=ax, alpha=0.5)
# Set up labels and title
plt.xlabel('Speed (mph)')
plt.ylabel('Number of Vehicles')
plt.title('Car Speeds Histogram')
# Fix the margins
ax.margins(x=0)
seaborn over matplotlib is the ease with which heatmaps can be madenb_cars = hourly_counts_d[(hourly_counts_d['Info'] == 'Car / Small Van <5.2 metres') &
(hourly_counts_d['Direction'] == 'N')].copy()
nb_cars['Day'] = nb_cars['DateTime'].dt.weekday_name
nb_cars['Hour'] = nb_cars['DateTime'].dt.time#hour.astype('str')
nb_cars.head(15)
| DateTime | Info | Direction | Count | Day | Hour | |
|---|---|---|---|---|---|---|
| 0 | 2016-06-01 00:00:00 | Car / Small Van <5.2 metres | N | 26 | Wednesday | 00:00:00 |
| 7 | 2016-06-01 01:00:00 | Car / Small Van <5.2 metres | N | 19 | Wednesday | 01:00:00 |
| 15 | 2016-06-01 02:00:00 | Car / Small Van <5.2 metres | N | 19 | Wednesday | 02:00:00 |
| 23 | 2016-06-01 03:00:00 | Car / Small Van <5.2 metres | N | 38 | Wednesday | 03:00:00 |
| 31 | 2016-06-01 04:00:00 | Car / Small Van <5.2 metres | N | 79 | Wednesday | 04:00:00 |
| 43 | 2016-06-01 05:00:00 | Car / Small Van <5.2 metres | N | 256 | Wednesday | 05:00:00 |
| 54 | 2016-06-01 06:00:00 | Car / Small Van <5.2 metres | N | 561 | Wednesday | 06:00:00 |
| 66 | 2016-06-01 07:00:00 | Car / Small Van <5.2 metres | N | 696 | Wednesday | 07:00:00 |
| 78 | 2016-06-01 08:00:00 | Car / Small Van <5.2 metres | N | 688 | Wednesday | 08:00:00 |
| 90 | 2016-06-01 09:00:00 | Car / Small Van <5.2 metres | N | 767 | Wednesday | 09:00:00 |
| 102 | 2016-06-01 10:00:00 | Car / Small Van <5.2 metres | N | 900 | Wednesday | 10:00:00 |
| 114 | 2016-06-01 11:00:00 | Car / Small Van <5.2 metres | N | 818 | Wednesday | 11:00:00 |
| 126 | 2016-06-01 12:00:00 | Car / Small Van <5.2 metres | N | 811 | Wednesday | 12:00:00 |
| 139 | 2016-06-01 13:00:00 | Car / Small Van <5.2 metres | N | 852 | Wednesday | 13:00:00 |
| 151 | 2016-06-01 14:00:00 | Car / Small Van <5.2 metres | N | 874 | Wednesday | 14:00:00 |
column_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
pivot = nb_cars.pivot(index='Day', columns='Hour', values='Count')
pivot = pivot.reindex_axis(column_order, axis=0)
pivot
| Hour | 00:00:00 | 01:00:00 | 02:00:00 | 03:00:00 | 04:00:00 | 05:00:00 | 06:00:00 | 07:00:00 | 08:00:00 | 09:00:00 | ... | 14:00:00 | 15:00:00 | 16:00:00 | 17:00:00 | 18:00:00 | 19:00:00 | 20:00:00 | 21:00:00 | 22:00:00 | 23:00:00 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Day | |||||||||||||||||||||
| Sunday | 74 | 39 | 30 | 32 | 41 | 101 | 215 | 325 | 453 | 701 | ... | 830 | 838 | 839 | 615 | 524 | 392 | 305 | 154 | 85 | 80 |
| Monday | 40 | 16 | 22 | 46 | 176 | 382 | 543 | 716 | 697 | 868 | ... | 924 | 1073 | 1173 | 667 | 523 | 363 | 291 | 151 | 81 | 48 |
| Tuesday | 22 | 19 | 20 | 23 | 82 | 305 | 515 | 631 | 649 | 709 | ... | 828 | 1001 | 1150 | 690 | 516 | 408 | 290 | 177 | 67 | 69 |
| Wednesday | 26 | 19 | 19 | 38 | 79 | 256 | 561 | 696 | 688 | 767 | ... | 874 | 1062 | 1130 | 797 | 561 | 397 | 309 | 174 | 114 | 124 |
| Thursday | 62 | 26 | 15 | 37 | 83 | 292 | 536 | 648 | 715 | 818 | ... | 927 | 1093 | 1168 | 861 | 635 | 482 | 313 | 212 | 102 | 43 |
| Friday | 20 | 23 | 24 | 33 | 77 | 242 | 475 | 635 | 631 | 839 | ... | 1226 | 1320 | 1507 | 1278 | 852 | 542 | 337 | 253 | 146 | 88 |
| Saturday | 48 | 29 | 36 | 37 | 85 | 186 | 351 | 531 | 756 | 1163 | ... | 970 | 914 | 775 | 613 | 461 | 320 | 218 | 202 | 151 | 108 |
7 rows × 24 columns
# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(pivot)
<matplotlib.axes._subplots.AxesSubplot at 0x238536c3390>
# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(pivot, annot=True, fmt='d', annot_kws={'size': 12})
<matplotlib.axes._subplots.AxesSubplot at 0x238536c3780>
import os
all_data = pd.concat([pd.read_csv(os.path.join('VBV_Data', f), low_memory=False)
for f in os.listdir('VBV_Data')
if f.startswith('VBV')])
all_cars = all_data[all_data['Info'].str.contains('Car')]
# Set up the graph size
fig, ax = plt.subplots(figsize=(15, 8))
box_plot = sns.boxplot(data=all_cars, y='SpeedMPH', x='SiteName')
for lbl in box_plot.get_xticklabels():
lbl.set_rotation(90)
ax.margins(y=0)
plotly is a different data visualisation library that allows for the creation of interactive graphsmatplotliblines = []
for vehicle_class, data in hourly_counts.groupby('Info'):
# Set up the line plot for each vehicle type, and append to a list
line = go.Scatter(x=data['DateTime'], y=data['Count'], name=vehicle_class)
lines.append(line)
# Set up layout
layout = go.Layout(legend=dict(orientation='h',
xanchor='center',
y=-0.2,
x=0.5),
xaxis=dict(title='Date'),
yaxis=dict(title='Vehicle Count'),
autosize=False,
width=1000,
height=600)
# Combine lines and layout, and plot
fig = go.Figure(data=lines, layout=layout)
plotly.offline.iplot(fig)
# Set up empty lists
lines = []
type_counts = []
for direction, directional_data in hourly_counts_d.groupby('Direction'):
# Initialise counter
i = 0
for vehicle_class, data in directional_data.groupby('Info'):
# Set up the line plot for each vehicle type, and append to a list
line = go.Scatter(x=data['DateTime'], y=data['Count'], name=vehicle_class,
visible=not len(type_counts))
lines.append(line)
# Increment counter
i += 1
# Add the number of types for this direction to the list
type_counts.append(i)
# Set up update menus
updatemenus = list([
dict(active=0,
showactive=True,
buttons=list([
dict(label = 'N',
method = 'update',
args = [{'visible': [True] * type_counts[0] + [False] * type_counts[1]},
{'title': 'N'}]),
dict(label = 'S',
method = 'update',
args = [{'visible': [False] * type_counts[0] + [True] * type_counts[1]},
{'title': 'S'}])
])
)
])
# Set up layout
layout = go.Layout(legend=dict(orientation='h',
xanchor='center',
y=-0.2,
x=0.5),
xaxis=dict(title='Date'),
yaxis=dict(title='Vehicle Count'),
autosize=False,
width=1000,
height=600,
updatemenus=updatemenus)
# Combine lines and layout, and plot
fig = go.Figure(data=lines, layout=layout)
plotly.offline.iplot(fig)
matplotlib# Set up the histogram
hist = go.Histogram(x=cars['SpeedMPH'], opacity=0.5,
autobinx=False, xbins=dict(start=0,
end=max(cars['SpeedMPH']),
size=10))
# Set up the layout
layout = go.Layout(xaxis=dict(title='Speed (mph)'),
yaxis=dict(title='Number of Vehicles'),
title='Car Speeds Histogram',
autosize=False,
width=1000,
height=600)
# Combine the histogram and layout
figure = go.Figure(data=[hist], layout=layout)
plotly.offline.iplot(figure)
matplotlib won't work here.# We need to sort our data so the y axis is shown in the correct order
column_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
nb_cars['Day'] = pd.Categorical(nb_cars['Day'],
categories=column_order,
ordered=True)
nb_cars.sort_values(['Day', 'Hour'], inplace=True)
# Generate the heatmap
heat = go.Heatmap(x=nb_cars['Hour'],
y=nb_cars['Day'],
z=nb_cars['Count'],
hoverinfo='z')
# Generate the layout
layout = go.Layout(xaxis=dict(title='Hour'),
yaxis=dict(autorange='reversed'),
title='Car Counts Heatmap',
autosize=False,
width=1000,
height=600)
# Combine the data and layout
figure = go.Figure(data=[heat], layout=layout)
plotly.offline.iplot(figure)
# Set up the individual boxes
boxes = [go.Box(y=data['SpeedMPH'], name=site, boxmean=True)
for site, data in all_cars.groupby('SiteName')]
# Set up the layout
layout = go.Layout(xaxis=dict(title='Site'),
yaxis=dict(title='Speed (mph)'),
title='Car Speeds Box Plot',
autosize=False,
width=1000,
height=600)
# Combine the data and layout
figure = go.Figure(data=boxes, layout=layout)
plotly.offline.iplot(figure)
Pandas has many further functions, and there are a lot of other libraries to use in data processing, including database libraries such as sqlite3Bokeh and HoloViews